Load the required libraries
library(tidyverse)
library(kableExtra)
library(lubridate)
library(forecast)
Load required Datasets
Creating a character object called months.abb
head(data_pro,10)
## # A tibble: 10 x 10
## X1 begins_at open_price close_price high_price low_price volume
## <dbl> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 2015-04-28 00:00:00 24.9 24.9 24.9 24.9 0
## 2 1 2015-04-29 00:00:00 24.9 24.9 24.9 24.9 0
## 3 2 2015-04-30 00:00:00 24.9 24.9 24.9 24.9 0
## 4 3 2015-05-01 00:00:00 24.9 24.9 24.9 24.9 0
## 5 4 2015-05-04 00:00:00 24.9 24.9 24.9 24.9 0
## 6 5 2015-05-05 00:00:00 24.9 24.9 24.9 24.9 0
## 7 6 2015-05-06 00:00:00 24.9 24.9 24.9 24.9 0
## 8 7 2015-05-07 00:00:00 24.9 24.9 24.9 24.9 0
## 9 8 2015-05-08 00:00:00 24.9 24.9 24.9 24.9 0
## 10 9 2015-05-11 00:00:00 24.9 24.9 24.9 24.9 0
## # … with 3 more variables: session <chr>, interpolated <lgl>, sname <chr>
# Three-letter English month abbreviations ("Jan" ... "Dec"), taken from
# base R's built-in `month.abb` constant instead of being typed out by hand.
months.abb <- month.abb
Now, let’s use the mutate() function from the dplyr package to calculate the average price of the stocks
head(data_indus)
## # A tibble: 6 x 3
## Sector CODE NAME
## <chr> <chr> <chr>
## 1 Basic Materials Sector GPRE Green Plains Inc
## 2 Basic Materials Sector BCPC Balchem Corporation
## 3 Basic Materials Sector STLD Steel Dynamics, Inc
## 4 Basic Materials Sector METC Ramaco Resources, Inc
## 5 Basic Materials Sector ASH Ashland Global Holdings Inc
## 6 Basic Materials Sector TRX Tanzanian Gold Corporation
# NOTE(review): mean() is applied to the whole data_pro tibble here, which is
# not numeric, so avg_price comes back NA for every row (see the warning in
# the output below). Per-ticker averages are computed separately into
# data_price further down.
mutate(data_indus,avg_price = mean(data_pro[]))
## Warning in mean.default(data_pro[]): argument is not numeric or logical:
## returning NA
## # A tibble: 5,195 x 4
## Sector CODE NAME avg_price
## <chr> <chr> <chr> <dbl>
## 1 Basic Materials Sector GPRE Green Plains Inc NA
## 2 Basic Materials Sector BCPC Balchem Corporation NA
## 3 Basic Materials Sector STLD Steel Dynamics, Inc NA
## 4 Basic Materials Sector METC Ramaco Resources, Inc NA
## 5 Basic Materials Sector ASH Ashland Global Holdings Inc NA
## 6 Basic Materials Sector TRX Tanzanian Gold Corporation NA
## 7 Basic Materials Sector HCC Warrior Met Coal, Inc NA
## 8 Basic Materials Sector AUG Auryn Resources Inc NA
## 9 Basic Materials Sector RFP Resolute Forest Products Inc NA
## 10 Basic Materials Sector EGO Eldorado Gold Corporation NA
## # … with 5,185 more rows
# Mean opening price of every ticker over the whole sample
data_price <- data_pro %>%
  group_by(sname) %>%
  summarise(Avg_price = mean(open_price))
Yearly average price for the stocks
# Mean opening price of every ticker within each calendar year
data_price_year <- data_pro %>%
  select(begins_at, open_price, sname) %>%
  mutate(Year = year(begins_at)) %>%
  group_by(Year, sname) %>%
  summarise(Avg_price = mean(open_price))
# NOTE(review): this previews data_price (the per-ticker averages), as the
# output below shows, not the yearly table data_price_year built just above.
# Presumably head(data_price_year) was intended; confirm.
head(data_price)
## # A tibble: 6 x 2
## sname Avg_price
## <chr> <dbl>
## 1 AA 30.5
## 2 AAN 40.2
## 3 AAP 146.
## 4 AAT 40.6
## 5 AB 26.2
## 6 ABB 21.5
Monthly average price for the stocks
# Mean opening price of every ticker within each (year, month) pair;
# Month is stored as a three-letter abbreviation looked up in months.abb
data_price_month <- data_pro %>%
  select(begins_at, open_price, sname) %>%
  mutate(
    Year = year(begins_at),
    Month = months.abb[month(begins_at)]
  ) %>%
  group_by(Year, Month, sname) %>%
  summarise(Avg_price = mean(open_price))
# NOTE(review): this previews data_price (the per-ticker averages), as the
# output below shows, not the monthly table data_price_month built just above.
# Presumably head(data_price_month) was intended; confirm.
head(data_price)
## # A tibble: 6 x 2
## sname Avg_price
## <chr> <dbl>
## 1 AA 30.5
## 2 AAN 40.2
## 3 AAP 146.
## 4 AAT 40.6
## 5 AB 26.2
## 6 ABB 21.5
# Yearly average opening price for ticker AA
data_price_year %>%
  filter(sname == "AA")
## # A tibble: 6 x 3
## # Groups: Year [6]
## Year sname Avg_price
## <dbl> <chr> <dbl>
## 1 2015 AA 24.9
## 2 2016 AA 25.4
## 3 2017 AA 38.3
## 4 2018 AA 44.3
## 5 2019 AA 23.7
## 6 2020 AA 12.4
Let’s look at our data structure
glimpse(data_pro)
## Rows: 370,440
## Columns: 10
## $ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ begins_at <dttm> 2015-04-28, 2015-04-29, 2015-04-30, 2015-05-01, 2015-05…
## $ open_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ close_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ high_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ low_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ volume <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ session <chr> "reg", "reg", "reg", "reg", "reg", "reg", "reg", "reg", …
## $ interpolated <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ sname <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A…
summary(data_pro)
## X1 begins_at open_price
## Min. : 0.0 Min. :2015-04-28 00:00:00 Min. : 0.00
## 1st Qu.: 314.8 1st Qu.:2016-07-26 18:00:00 1st Qu.: 13.61
## Median : 629.5 Median :2017-10-24 12:00:00 Median : 26.33
## Mean : 629.5 Mean :2017-10-25 14:51:25 Mean : 48.92
## 3rd Qu.: 944.2 3rd Qu.:2019-01-25 18:00:00 3rd Qu.: 56.09
## Max. :1259.0 Max. :2020-04-27 00:00:00 Max. :1266.56
## close_price high_price low_price volume
## Min. : 0.0052 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 13.6100 1st Qu.: 13.72 1st Qu.: 13.5 1st Qu.: 103282
## Median : 26.3300 Median : 26.67 Median : 26.0 Median : 421838
## Mean : 48.9221 Mean : 49.43 Mean : 48.4 Mean : 1650980
## 3rd Qu.: 56.1200 3rd Qu.: 56.68 3rd Qu.: 55.5 3rd Qu.: 1332573
## Max. :1250.0000 Max. :1274.41 Max. :1232.0 Max. :375088650
## session interpolated sname
## Length:370440 Mode :logical Length:370440
## Class :character FALSE:362193 Class :character
## Mode :character TRUE :8247 Mode :character
##
##
##
data_pro %>%
summary() %>%
kable() %>%
kable_styling()
| X1 | begins_at | open_price | close_price | high_price | low_price | volume | session | interpolated | sname | |
|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 0.0 | Min. :2015-04-28 00:00:00 | Min. : 0.00 | Min. : 0.0052 | Min. : 0.00 | Min. : 0.0 | Min. : 0 | Length:370440 | Mode :logical | Length:370440 | |
| 1st Qu.: 314.8 | 1st Qu.:2016-07-26 18:00:00 | 1st Qu.: 13.61 | 1st Qu.: 13.6100 | 1st Qu.: 13.72 | 1st Qu.: 13.5 | 1st Qu.: 103282 | Class :character | FALSE:362193 | Class :character | |
| Median : 629.5 | Median :2017-10-24 12:00:00 | Median : 26.33 | Median : 26.3300 | Median : 26.67 | Median : 26.0 | Median : 421838 | Mode :character | TRUE :8247 | Mode :character | |
| Mean : 629.5 | Mean :2017-10-25 14:51:25 | Mean : 48.92 | Mean : 48.9221 | Mean : 49.43 | Mean : 48.4 | Mean : 1650980 | NA | NA | NA | |
| 3rd Qu.: 944.2 | 3rd Qu.:2019-01-25 18:00:00 | 3rd Qu.: 56.09 | 3rd Qu.: 56.1200 | 3rd Qu.: 56.68 | 3rd Qu.: 55.5 | 3rd Qu.: 1332573 | NA | NA | NA | |
| Max. :1259.0 | Max. :2020-04-27 00:00:00 | Max. :1266.56 | Max. :1250.0000 | Max. :1274.41 | Max. :1232.0 | Max. :375088650 | NA | NA | NA |
Calculate the number of zeros in each variable in the dataset
# Count exact zeros per numeric column and express them as a share of all rows.
# Only the numeric columns are reshaped: stacking every column (dates,
# characters, logicals) into a single value column forces a coercion to
# character and raises an "attributes are not identical across measure
# variables" warning.
data_pro %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  filter(value == 0) %>%
  count(variable) %>%
  mutate(
    percent = n / nrow(data_pro) * 100,
    # one decimal place below 10%, whole percentages otherwise
    percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")
  ) %>%
  arrange(desc(n)) %>%
  # rename("Variable With Zeros"=variable,"Number of Records"=n,"Share of Total"=percent) %>%
  kable() %>%
  kable_styling()
| variable | n | percent |
|---|---|---|
| volume | 8625 | 2.3% |
| X1 | 294 | 0.1% |
| high_price | 1 | 0% |
| low_price | 1 | 0% |
| open_price | 1 | 0% |
# We will not drop these now, but we will review them later. We are not using volume in our analysis at this time, so we will drop those data points later.
Now, let’s only target shares whose open prices are between 100 and 200
unique(data_pro$sname)
## [1] "AA" "AAN" "AAP" "AAT" "AB" "ABB" "ABBV" "ABC" "ABEV"
## [10] "ABG" "ABM" "ABR" "ABT" "ACC" "ACCO" "ACH" "ACM" "ACN"
## [19] "ACP" "ACRE" "ACT" "ADC" "ADM" "ADPT" "ADS" "ADT" "ADX"
## [28] "AEB" "AEE" "AEG" "AEL" "AEM" "AEO" "AEP" "AER" "AES"
## [37] "AFC" "AFB" "AFG" "AFL" "AFSIA" "AFSIB" "AFSIC" "AFT" "AG"
## [46] "AGCO" "AGD" "AGI" "AGM" "AGN" "AGO" "AGRO" "AGX" "AHC"
## [55] "AHH" "AHT" "AI" "AIF" "AIG" "AIN" "AIR" "AIT" "AIV"
## [64] "AIW" "AIZ" "AJG" "AKR" "AL" "ALB" "ALE" "ALEX" "ALG"
## [73] "ALK" "ALL" "ALLE" "ALLY" "ALPN" "ALSN" "ALV" "ALX" "AM"
## [82] "AMC" "AME" "AMG" "AMH" "AMHC" "AMP" "AMRC" "AMT" "AMTD"
## [91] "AMX" "AN" "ANET" "ANF" "ANH" "ANTM" "AOD" "AON" "AOS"
## [100] "AP" "APA" "APAM" "APD" "APH" "APLE" "APO" "AR" "ARC"
## [109] "ARCO" "ARDC" "ARE" "ARES" "ARI" "ARL" "ARMK" "ARR" "ARW"
## [118] "ASA" "ASB" "ASC" "ASG" "ASGN" "ASH" "ASPN" "ASR" "ASX"
## [127] "AT" "ATEN" "ATHM" "ATI" "ATLS" "ATO" "ATR" "ATTO" "ATV"
## [136] "AU" "AUY" "AVA" "AVAL" "AVB" "AVD" "AVH" "AVK" "AVT"
## [145] "AVY" "AWF" "AWI" "AWK" "AWP" "AWR" "AXE" "AXL" "AXP"
## [154] "AXR" "AXS" "AXTA" "AYI" "AZN" "AZO" "AZZ" "B" "BA"
## [163] "BABA" "BAC" "BAF" "BAH" "BAK" "BAM" "BANC" "BAP" "BAX"
## [172] "BBD" "BBDO" "BBF" "BBK" "BBL" "BBN" "BBVA" "BBW" "BBX"
## [181] "BBY" "BC" "BCC" "BCE" "BCEI" "BCH" "BCO" "BCS" "BCX"
## [190] "BDC" "BDJ" "BDN" "BDX" "BEN" "BEP" "BERY" "BFAM" "BFK"
## [199] "BFO" "BFS" "BFZ" "BG" "BGB" "BGG" "BGH" "BGR" "BGS"
## [208] "BGT" "BGX" "BGY" "BH" "BHE" "BHK" "BHLB" "BHP" "BIF"
## [217] "BIG" "BIO" "BIP" "BIT" "BITA" "BK" "BKD" "BKE" "BKH"
## [226] "BKK" "BKN" "BKT" "BKU" "BLK" "BLL" "BLW" "BLX" "BMA"
## [235] "BME" "BMI" "BMO" "BMY" "BNS" "BNY" "BOE" "BOH" "BOOT"
## [244] "BP" "BPT" "BPY" "BQH" "BR" "BRC" "BRFS" "BRO" "BRP"
## [253] "BRT" "BRX" "BSAC" "BSBR" "BSD" "BSE" "BSL" "BSMX" "BST"
## [262] "BSX" "BTA" "BTE" "BTO" "BTT" "BTU" "BTZ" "BUD" "BUI"
## [271] "BURL" "BVN" "BWA" "BWG" "BX" "BXC" "BXMT" "BXMX" "BXP"
## [280] "BXS" "BYD" "BYM" "BZH" "C" "CC" "CL" "CN" "CP"
## [289] "CB" "CACI" "CAE"
# Average opening price per ticker, computed only over the trading days on
# which the opening price was strictly between 100 and 200, drawn as bars.
data_pro %>%
  filter(open_price > 100, open_price < 200) %>%
  group_by(sname) %>%
  summarise(Avg_price = mean(open_price)) %>%
  ggplot(aes(x = sname, y = Avg_price)) +
  geom_col()
The graph above shows the average opening price for each stock symbol (sname), computed over the days on which the opening price was between 100 and 200.
Checking only with the 500 stocks data and analyzing the distribution of data in each sector
# Number of tickers per sector: attach sector information to the per-ticker
# averages, drop tickers without a sector, and count rows per Sector.
data_price %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  ggplot(aes(x = Sector)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))
The first graph shows the number of stocks in each sector.
# Count of ticker rows per sector, one facet per year.
# The yearly averages are widened to one column per year, joined with the
# sector table, and stacked back into long form.
data_price_year %>%
  spread(Year, Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  pivot_longer(
    c("2015", "2016", "2017", "2018", "2019", "2020"),
    # names_to is passed by name: current tidyr places `...` directly after
    # `cols`, so a positional value is not matched to names_to
    names_to = "Year",
    values_to = "Avg_price"
  ) %>%
  ggplot(aes(x = Sector, colour = Year)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
  facet_wrap("Year")
The graph above shows the number of stocks in each sector, with one panel per year.
# Stacked bars of the yearly average prices per sector, coloured by year.
data_price_year %>%
  spread(Year, Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  pivot_longer(
    c("2015", "2016", "2017", "2018", "2019", "2020"),
    # names_to is passed by name: current tidyr places `...` directly after
    # `cols`, so a positional value is not matched to names_to
    names_to = "Year",
    values_to = "Avg_price"
  ) %>%
  ggplot(aes(x = Sector, y = Avg_price, fill = Year)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))
#+facet_wrap('Year')
The third graph shows the average price of the stocks in each sector, broken down by year.
Below are graphs of the sectors by month and year, which show some patterns.
We will do some analysis to see how stocks from a few of these industries fit AR (autoregressive) and MA (moving average) models.
Analyzing average price of stocks yearly for each sector in the dataset
# Monthly average prices between 100 and 200 per sector, dodged by year.
data_price_month %>%
  spread(Year, Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  pivot_longer(
    c("2015", "2016", "2017", "2018", "2019", "2020"),
    # names_to is passed by name: current tidyr places `...` directly after
    # `cols`, so a positional value is not matched to names_to
    names_to = "Year",
    values_to = "Avg_price"
  ) %>%
  filter(Avg_price > 100 & Avg_price < 200) %>%
  ggplot(aes(x = Sector, y = Avg_price, fill = Year)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))
#+ geom_bar(position = "dodge2")
#+facet_wrap('Month')
Analyzing average price of stocks monthly for each sector in the dataset
# By month: monthly average prices per sector, dodged and coloured by month.
# Year/month combinations outside the sample become NA when the table is
# widened by year and are dropped by geom_col with a warning.
data_price_month %>%
  spread(Year, Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  pivot_longer(
    c("2015", "2016", "2017", "2018", "2019", "2020"),
    # names_to is passed by name: current tidyr places `...` directly after
    # `cols`, so a positional value is not matched to names_to
    names_to = "Year",
    values_to = "Avg_price"
  ) %>%
  ggplot(aes(x = Sector, y = Avg_price, fill = Month)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))
## Warning: Removed 2904 rows containing missing values (geom_col).
Box plot for year
# Box plot of monthly average prices per sector and year.
# The word "Sector" is stripped from the axis labels and the y axis is
# limited to 0-200, so values outside that range are removed with a warning.
data_price_month %>%
  spread(Year, Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  pivot_longer(
    c("2015", "2016", "2017", "2018", "2019", "2020"),
    # names_to is passed by name: current tidyr places `...` directly after
    # `cols`, so a positional value is not matched to names_to
    names_to = "Year",
    values_to = "Avg_price"
  ) %>%
  ggplot(aes(x = stringr::str_remove(Sector, 'Sector'), y = Avg_price, fill = Year)) +
  geom_boxplot(position = "dodge") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(title = "Boxplot of Stocks by Year ") +
  ylim(0, 200) +
  xlab("Sector")
## Warning: Removed 3518 rows containing non-finite values (stat_boxplot).
# ----- TEMP
# spread(data_price_month,Year,Avg_price) %>% left_join(.,data_indus,by = c("sname"="CODE")) %>% filter(!is.na(Sector)) %>% pivot_longer(c('2015','2016','2017','2018','2019','2020'),"Year",values_to = "Avg_price" ) %>% ggplot(mapping = aes(Month,Avg_price,fill=Year)) + geom_col() + theme(axis.text.x = element_text(angle = 70, hjust = 1)) + geom_bar(position = "dodge2")
# # ggplot(mapping = aes(Avg_price,fill=Month)) + geom_histogram(position = "fill")
# # ggplot( aes(Month, Avg_price)) + geom_area(aes(fill = Month))
# # ggplot(mapping = aes(Month , Avg_price,group=Sector)) + geom_line(aes(colour = Sector), position = "stack") + geom_point(aes(colour = Sector), position = "stack") + geom_area(aes(fill = Sector))
From the box plot above we can see that almost all the sectors in our dataset have some outliers over the five years, except for two sectors: Technology and Utilities.
Analyzing the top stocks by average price, together with their sectors
library(dplyr)
# Sixteen highest rows by mean monthly average price, with sector and company
# name attached. A ticker listed under several sectors in data_indus appears
# once per sector.
data_price_month %>%
  group_by(sname) %>%
  summarise(Avg = mean(Avg_price)) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  arrange(desc(Avg)) %>%
  top_n(n = 16, wt = Avg)
## # A tibble: 16 x 4
## sname Avg Sector NAME
## <chr> <dbl> <chr> <chr>
## 1 AZO 801. Consumer Cyclical Sec… AutoZone, Inc
## 2 AZO 801. Consumer Defensive Se… AutoZone, Inc
## 3 BLK 420. Financial Services Se… BlackRock, Inc
## 4 BH 292. Consumer Cyclical Sec… Biglari Holdings Inc
## 5 BH 292. Consumer Defensive Se… Biglari Holdings Inc
## 6 BA 245. Industrials Sector The Boeing Company
## 7 BIO 234. Healthcare Sector Bio-Rad Laboratories, Inc
## 8 AGN 212. Healthcare Sector Allergan plc
## 9 ADS 208. Financial Services Se… Alliance Data Systems Corporation
## 10 ANTM 206. Healthcare Sector Anthem, Inc
## 11 BDX 203. Healthcare Sector Becton, Dickinson and Company
## 12 BAP 182. Financial Services Se… Credicorp Ltd
## 13 CP 179. Industrials Sector Canadian Pacific Railway Limited
## 14 AYI 172. Industrials Sector Acuity Brands, Inc
## 15 ANET 169. Technology Sector Arista Networks, Inc
## 16 ASR 164. Industrials Sector Grupo Aeroportuario del Sureste, S. A. B.…
# Twenty highest rows by mean monthly average price, with sector and company
# name attached; these tickers are the ones kept for the time-series analysis.
data_stock <- data_price_month %>%
  group_by(sname) %>%
  summarise(Avg = mean(Avg_price)) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  arrange(desc(Avg)) %>%
  top_n(n = 20, wt = Avg)
head(data_stock)
## # A tibble: 6 x 4
## sname Avg Sector NAME
## <chr> <dbl> <chr> <chr>
## 1 AZO 801. Consumer Cyclical Sector AutoZone, Inc
## 2 AZO 801. Consumer Defensive Sector AutoZone, Inc
## 3 BLK 420. Financial Services Sector BlackRock, Inc
## 4 BH 292. Consumer Cyclical Sector Biglari Holdings Inc
## 5 BH 292. Consumer Defensive Sector Biglari Holdings Inc
## 6 BA 245. Industrials Sector The Boeing Company
unique(cbind(data_stock$NAME,data_stock$sname,data_stock$Sector))
## [,1] [,2]
## [1,] "AutoZone, Inc" "AZO"
## [2,] "AutoZone, Inc" "AZO"
## [3,] "BlackRock, Inc" "BLK"
## [4,] "Biglari Holdings Inc" "BH"
## [5,] "Biglari Holdings Inc" "BH"
## [6,] "The Boeing Company" "BA"
## [7,] "Bio-Rad Laboratories, Inc" "BIO"
## [8,] "Allergan plc" "AGN"
## [9,] "Alliance Data Systems Corporation" "ADS"
## [10,] "Anthem, Inc" "ANTM"
## [11,] "Becton, Dickinson and Company" "BDX"
## [12,] "Credicorp Ltd" "BAP"
## [13,] "Canadian Pacific Railway Limited" "CP"
## [14,] "Acuity Brands, Inc" "AYI"
## [15,] "Arista Networks, Inc" "ANET"
## [16,] "Grupo Aeroportuario del Sureste, S. A. B. de C. V" "ASR"
## [17,] "Air Products and Chemicals, Inc" "APD"
## [18,] "CACI International Inc" "CACI"
## [19,] "Advance Auto Parts, Inc" "AAP"
## [20,] "Advance Auto Parts, Inc" "AAP"
## [,3]
## [1,] "Consumer Cyclical Sector"
## [2,] "Consumer Defensive Sector"
## [3,] "Financial Services Sector"
## [4,] "Consumer Cyclical Sector"
## [5,] "Consumer Defensive Sector"
## [6,] "Industrials Sector"
## [7,] "Healthcare Sector"
## [8,] "Healthcare Sector"
## [9,] "Financial Services Sector"
## [10,] "Healthcare Sector"
## [11,] "Healthcare Sector"
## [12,] "Financial Services Sector"
## [13,] "Industrials Sector"
## [14,] "Industrials Sector"
## [15,] "Technology Sector"
## [16,] "Industrials Sector"
## [17,] "Basic Materials Sector"
## [18,] "Technology Sector"
## [19,] "Consumer Cyclical Sector"
## [20,] "Consumer Defensive Sector"
# ggplot(mapping = aes(Month,Avg_price,fill=Year)) + geom_col() + theme(axis.text.x = element_text(angle = 70, hjust = 1)) + geom_bar(position = "dodge2")
We will study the flow of some of the stocks from the Healthcare, Technology and Industrials sectors, such as:
ANTM Anthem, Inc
ANET Arista Networks, Inc
BA The Boeing Company
library(stringr)
# Monthly averages with one column per year, joined with sector information
data_price_month %>%
  spread(Year, Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector))
## # A tibble: 3,168 x 10
## # Groups: Month [12]
## Month sname `2015` `2016` `2017` `2018` `2019` `2020` Sector NAME
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Apr AA 24.9 24.9 33.4 52.2 28.3 7.12 Basic Mate… Alcoa Corp…
## 2 Apr AAN 34.2 26.3 30.7 46.1 53.8 23.3 Industrial… Aaron's, I…
## 3 Apr AAP 145. 159. 144. 111. 175. 107. Consumer C… Advance Au…
## 4 Apr AAP 145. 159. 144. 111. 175. 107. Consumer D… Advance Au…
## 5 Apr AB 31.2 23.9 22.9 26.6 29.4 19.7 Financial … AllianceBe…
## 6 Apr ABB 21.8 20.0 23.3 23.3 20.1 17.4 Industrial… ABB Ltd
## 7 Apr ABBV 65.2 59.6 64.8 93.2 81.0 79.6 Healthcare… AbbVie Inc
## 8 Apr ABC 115. 88.0 84.9 88.8 75.1 87.4 Healthcare… Amerisourc…
## 9 Apr ABG 86.0 58.1 59.9 67.8 74.9 53.4 Consumer C… Asbury Aut…
## 10 Apr ABG 86.0 58.1 59.9 67.8 74.9 53.4 Consumer D… Asbury Aut…
## # … with 3,158 more rows
summary(data_pro)
## X1 begins_at open_price
## Min. : 0.0 Min. :2015-04-28 00:00:00 Min. : 0.00
## 1st Qu.: 314.8 1st Qu.:2016-07-26 18:00:00 1st Qu.: 13.61
## Median : 629.5 Median :2017-10-24 12:00:00 Median : 26.33
## Mean : 629.5 Mean :2017-10-25 14:51:25 Mean : 48.92
## 3rd Qu.: 944.2 3rd Qu.:2019-01-25 18:00:00 3rd Qu.: 56.09
## Max. :1259.0 Max. :2020-04-27 00:00:00 Max. :1266.56
## close_price high_price low_price volume
## Min. : 0.0052 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 13.6100 1st Qu.: 13.72 1st Qu.: 13.5 1st Qu.: 103282
## Median : 26.3300 Median : 26.67 Median : 26.0 Median : 421838
## Mean : 48.9221 Mean : 49.43 Mean : 48.4 Mean : 1650980
## 3rd Qu.: 56.1200 3rd Qu.: 56.68 3rd Qu.: 55.5 3rd Qu.: 1332573
## Max. :1250.0000 Max. :1274.41 Max. :1232.0 Max. :375088650
## session interpolated sname
## Length:370440 Mode :logical Length:370440
## Class :character FALSE:362193 Class :character
## Mode :character TRUE :8247 Mode :character
##
##
##
# Only Keeping Date, open_price , sname , interpolated = FALSE
glimpse((data_pro))
## Rows: 370,440
## Columns: 10
## $ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ begins_at <dttm> 2015-04-28, 2015-04-29, 2015-04-30, 2015-05-01, 2015-05…
## $ open_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ close_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ high_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ low_price <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24…
## $ volume <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ session <chr> "reg", "reg", "reg", "reg", "reg", "reg", "reg", "reg", …
## $ interpolated <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ sname <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A…
# Keep only genuinely observed (non-interpolated) prices for the tickers
# selected in data_stock, with just the date, opening price and ticker.
# Columns are selected by name rather than by position so the result does not
# depend on the column order of data_pro.
data_Main <- data_pro %>%
  filter(!interpolated) %>%
  select(begins_at, open_price, sname) %>%
  filter(sname %in% data_stock$sname)
head(data_Main)
## # A tibble: 6 x 3
## begins_at open_price sname
## <dttm> <dbl> <chr>
## 1 2015-04-28 00:00:00 145 AAP
## 2 2015-04-29 00:00:00 144. AAP
## 3 2015-04-30 00:00:00 144. AAP
## 4 2015-05-01 00:00:00 143. AAP
## 5 2015-05-04 00:00:00 145. AAP
## 6 2015-05-05 00:00:00 145. AAP
Converting the stock data to wide format
# One row per date, one opening-price column per ticker
wide_data_Main <- data_Main %>%
  spread(key = sname, value = open_price)
head(wide_data_Main)
## # A tibble: 6 x 18
## begins_at AAP ADS AGN ANET ANTM APD ASR AYI AZO
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2015-04-28 00:00:00 145 303 282. 64.8 150. 139. 152. 167. 692.
## 2 2015-04-29 00:00:00 144. 301. 286. 65.7 156. 140. 151. 165. 690
## 3 2015-04-30 00:00:00 144. 300. 287. 64.6 152. 142. 150. 169. 683.
## 4 2015-05-01 00:00:00 143. 300. 285. 64.1 152. 133. 145. 167. 675.
## 5 2015-05-04 00:00:00 145. 300. 290. 64.2 153. 137. 147. 170. 683
## 6 2015-05-05 00:00:00 145. 300 290. 64.7 155. 137. 144. 173. 682
## # … with 8 more variables: BA <dbl>, BAP <dbl>, BDX <dbl>, BH <dbl>, BIO <dbl>,
## # BLK <dbl>, CACI <dbl>, CP <dbl>
Fit an AR model to the following data:
ANTM Anthem, Inc
ANET Arista Networks, Inc
BA The Boeing Company
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# wide_data_Main$begins_at <- as_datetime(wide_data_Main$begins_at)

# Build a Date-indexed xts series from one ticker column of a wide price table.
#   prices: data frame with a begins_at column and one column per ticker
#   ticker: name of the ticker column to extract
stock_xts <- function(prices, ticker) {
  xts(prices[[ticker]], order.by = as.Date(prices$begins_at))
}

# Full-sample series
stocks_ANTM <- stock_xts(wide_data_Main, "ANTM")
stocks_ANET <- stock_xts(wide_data_Main, "ANET")
stocks_BA <- stock_xts(wide_data_Main, "BA")

# Split the sample on calendar year with a logical mask. Negating the mask
# (instead of indexing with -which(...)) still keeps every row when there are
# no 2020 observations; -which() on an empty match would select zero rows.
is_2020 <- year(wide_data_Main$begins_at) %in% 2020
# Data for 2020 only
wide_data_Main_20 <- wide_data_Main[is_2020, ]
# Data for every year other than 2020
wide_data_Main_Old <- wide_data_Main[!is_2020, ]

# 2020 series
stocks_ANTM_MY <- stock_xts(wide_data_Main_20, "ANTM")
stocks_ANET_MY <- stock_xts(wide_data_Main_20, "ANET")
stocks_BA_MY <- stock_xts(wide_data_Main_20, "BA")

# Series for the remaining years
stocks_ANTM_old <- stock_xts(wide_data_Main_Old, "ANTM")
stocks_ANET_old <- stock_xts(wide_data_Main_Old, "ANET")
stocks_BA_old <- stock_xts(wide_data_Main_Old, "BA")
index(stocks_ANTM_MY)
## [1] "2020-01-02" "2020-01-03" "2020-01-06" "2020-01-07" "2020-01-08"
## [6] "2020-01-09" "2020-01-10" "2020-01-13" "2020-01-14" "2020-01-15"
## [11] "2020-01-16" "2020-01-17" "2020-01-21" "2020-01-22" "2020-01-23"
## [16] "2020-01-24" "2020-01-27" "2020-01-28" "2020-01-29" "2020-01-30"
## [21] "2020-01-31" "2020-02-03" "2020-02-04" "2020-02-05" "2020-02-06"
## [26] "2020-02-07" "2020-02-10" "2020-02-11" "2020-02-12" "2020-02-13"
## [31] "2020-02-14" "2020-02-18" "2020-02-19" "2020-02-20" "2020-02-21"
## [36] "2020-02-24" "2020-02-25" "2020-02-26" "2020-02-27" "2020-02-28"
## [41] "2020-03-02" "2020-03-03" "2020-03-04" "2020-03-05" "2020-03-06"
## [46] "2020-03-09" "2020-03-10" "2020-03-11" "2020-03-12" "2020-03-13"
## [51] "2020-03-16" "2020-03-17" "2020-03-18" "2020-03-19" "2020-03-20"
## [56] "2020-03-23" "2020-03-24" "2020-03-25" "2020-03-26" "2020-03-27"
## [61] "2020-03-30" "2020-03-31" "2020-04-01" "2020-04-02" "2020-04-03"
## [66] "2020-04-06" "2020-04-07" "2020-04-08" "2020-04-09" "2020-04-13"
## [71] "2020-04-14" "2020-04-15" "2020-04-16" "2020-04-17" "2020-04-20"
## [76] "2020-04-21" "2020-04-22" "2020-04-23" "2020-04-24" "2020-04-27"
coredata(stocks_ANTM_MY)
## [,1]
## [1,] 302.67
## [2,] 293.68
## [3,] 295.75
## [4,] 299.20
## [5,] 300.89
## [6,] 307.83
## [7,] 307.94
## [8,] 306.88
## [9,] 296.04
## [10,] 297.26
## [11,] 302.92
## [12,] 305.07
## [13,] 303.94
## [14,] 306.68
## [15,] 303.46
## [16,] 304.84
## [17,] 293.63
## [18,] 285.00
## [19,] 279.50
## [20,] 269.26
## [21,] 265.68
## [22,] 266.11
## [23,] 269.90
## [24,] 275.36
## [25,] 288.78
## [26,] 279.70
## [27,] 275.88
## [28,] 277.04
## [29,] 286.27
## [30,] 294.73
## [31,] 299.29
## [32,] 297.45
## [33,] 302.01
## [34,] 301.38
## [35,] 292.03
## [36,] 283.49
## [37,] 279.21
## [38,] 269.84
## [39,] 263.06
## [40,] 249.94
## [41,] 259.60
## [42,] 270.13
## [43,] 286.55
## [44,] 287.65
## [45,] 278.18
## [46,] 264.00
## [47,] 278.33
## [48,] 278.60
## [49,] 262.03
## [50,] 267.50
## [51,] 226.50
## [52,] 229.76
## [53,] 224.45
## [54,] 206.22
## [55,] 204.29
## [56,] 188.54
## [57,] 183.98
## [58,] 190.09
## [59,] 221.28
## [60,] 217.61
## [61,] 224.66
## [62,] 235.28
## [63,] 217.01
## [64,] 210.53
## [65,] 208.89
## [66,] 215.51
## [67,] 235.70
## [68,] 227.15
## [69,] 240.94
## [70,] 240.78
## [71,] 245.76
## [72,] 249.15
## [73,] 254.51
## [74,] 279.01
## [75,] 262.48
## [76,] 255.00
## [77,] 255.09
## [78,] 264.71
## [79,] 265.48
## [80,] 267.56
first(stocks_ANTM_MY)
## [,1]
## 2020-01-02 302.67
stocks_ANTM_MY[]
## [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
## 2020-01-06 295.75
## 2020-01-07 299.20
## 2020-01-08 300.89
## 2020-01-09 307.83
## 2020-01-10 307.94
## 2020-01-13 306.88
## 2020-01-14 296.04
## 2020-01-15 297.26
## 2020-01-16 302.92
## 2020-01-17 305.07
## 2020-01-21 303.94
## 2020-01-22 306.68
## 2020-01-23 303.46
## 2020-01-24 304.84
## 2020-01-27 293.63
## 2020-01-28 285.00
## 2020-01-29 279.50
## 2020-01-30 269.26
## 2020-01-31 265.68
## 2020-02-03 266.11
## 2020-02-04 269.90
## 2020-02-05 275.36
## 2020-02-06 288.78
## 2020-02-07 279.70
## 2020-02-10 275.88
## 2020-02-11 277.04
## 2020-02-12 286.27
## 2020-02-13 294.73
## 2020-02-14 299.29
## 2020-02-18 297.45
## 2020-02-19 302.01
## 2020-02-20 301.38
## 2020-02-21 292.03
## 2020-02-24 283.49
## 2020-02-25 279.21
## 2020-02-26 269.84
## 2020-02-27 263.06
## 2020-02-28 249.94
## 2020-03-02 259.60
## 2020-03-03 270.13
## 2020-03-04 286.55
## 2020-03-05 287.65
## 2020-03-06 278.18
## 2020-03-09 264.00
## 2020-03-10 278.33
## 2020-03-11 278.60
## 2020-03-12 262.03
## 2020-03-13 267.50
## 2020-03-16 226.50
## 2020-03-17 229.76
## 2020-03-18 224.45
## 2020-03-19 206.22
## 2020-03-20 204.29
## 2020-03-23 188.54
## 2020-03-24 183.98
## 2020-03-25 190.09
## 2020-03-26 221.28
## 2020-03-27 217.61
## 2020-03-30 224.66
## 2020-03-31 235.28
## 2020-04-01 217.01
## 2020-04-02 210.53
## 2020-04-03 208.89
## 2020-04-06 215.51
## 2020-04-07 235.70
## 2020-04-08 227.15
## 2020-04-09 240.94
## 2020-04-13 240.78
## 2020-04-14 245.76
## 2020-04-15 249.15
## 2020-04-16 254.51
## 2020-04-17 279.01
## 2020-04-20 262.48
## 2020-04-21 255.00
## 2020-04-22 255.09
## 2020-04-23 264.71
## 2020-04-24 265.48
## 2020-04-27 267.56
tail(stocks_ANTM_MY,n=10)
## [,1]
## 2020-04-14 245.76
## 2020-04-15 249.15
## 2020-04-16 254.51
## 2020-04-17 279.01
## 2020-04-20 262.48
## 2020-04-21 255.00
## 2020-04-22 255.09
## 2020-04-23 264.71
## 2020-04-24 265.48
## 2020-04-27 267.56
endpoints(stocks_ANTM_MY,on="months")
## [1] 0 21 40 62 80
stocks_ANTM_MY['2020-04-14']
## [,1]
## 2020-04-14 245.76
month.abb[month(index(stocks_ANTM_MY))]
## [1] "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan"
## [13] "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Feb" "Feb" "Feb"
## [25] "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb"
## [37] "Feb" "Feb" "Feb" "Feb" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar"
## [49] "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar"
## [61] "Mar" "Mar" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr"
## [73] "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr"
nmonths(stocks_ANTM_MY)
## [1] 4
head(stocks_ANTM_MY,n=10)
## [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
## 2020-01-06 295.75
## 2020-01-07 299.20
## 2020-01-08 300.89
## 2020-01-09 307.83
## 2020-01-10 307.94
## 2020-01-13 306.88
## 2020-01-14 296.04
## 2020-01-15 297.26
# Weekly open/high/low/close of the 2020 ANTM series, one line per measure.
# The weekly bars are computed once and plotted against the date index of each
# bar instead of a week-of-year number: the week number repeats for the final
# partial week (two bars fall in week 17), which puts two observations at the
# same x position.
antm_weekly <- to.weekly(stocks_ANTM_MY)
antm_weekly %>%
  as.data.frame() %>%
  mutate(week_end = index(antm_weekly)) %>%
  pivot_longer(-week_end, names_to = "Key", values_to = "Prc") %>%
  ggplot(aes(x = week_end, y = Prc, colour = Key)) +
  geom_line() #+geom_col(position="dodge", alpha=0.5)
The graph above shows the weekly change in the Anthem stock data, i.e. the weekly open, close, high and low prices.
# Monthly open/high/low/close of the 2020 ANTM series, drawn both as lines and
# as semi-transparent dodged bars; the monthly bars are computed once.
antm_monthly <- to.monthly(stocks_ANTM_MY)
antm_monthly %>%
  as.data.frame() %>%
  cbind(yr = index(antm_monthly)) %>%
  pivot_longer(
    c(
      stocks_ANTM_MY.Open, stocks_ANTM_MY.High,
      stocks_ANTM_MY.Low, stocks_ANTM_MY.Close
    ),
    names_to = "Key",
    values_to = "Prc"
  ) %>%
  ggplot(aes(x = yr, y = Prc, fill = Key)) +
  geom_line(aes(colour = Key)) +
  geom_col(position = "dodge", alpha = 0.5)
In the above graph, it shows monthly change in Anthem stock data i.e. open price, close price, high price, low price.
Periodicity of Anthem Stocks data
periodicity(stocks_ANTM_MY)
## Daily periodicity from 2020-01-02 to 2020-04-27
str(stocks_ANTM_MY)
## An 'xts' object on 2020-01-02/2020-04-27 containing:
## Data: num [1:80, 1] 303 294 296 299 301 ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
week(index(to.weekly(stocks_ANTM_MY)))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 17
stocks_ANTM_MY[c("2020-01-02","2020-01-03")]
## [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
stocks_ANTM_MY[str_which(index(stocks_ANTM_MY),"\\d+-\\d+-03")]
## [,1]
## 2020-01-03 293.68
## 2020-02-03 266.11
## 2020-03-03 270.13
## 2020-04-03 208.89
With the commands head() and tail() we can see the first and last 6 rows of the series. Each series here has a single column holding the opening price. Using the command summary() we verify the descriptive statistics of the price series. The command str() returns the object structure. In this case, it’s an xts object, i.e. a time series.
library(forecast)
head(stocks_ANTM)
## [,1]
## 2015-04-28 150.15
## 2015-04-29 155.96
## 2015-04-30 151.83
## 2015-05-01 151.92
## 2015-05-04 153.45
## 2015-05-05 154.67
tail(stocks_ANTM)
## [,1]
## 2020-04-20 262.48
## 2020-04-21 255.00
## 2020-04-22 255.09
## 2020-04-23 264.71
## 2020-04-24 265.48
## 2020-04-27 267.56
summary(stocks_ANTM)
## Index stocks_ANTM
## Min. :2015-04-28 Min. :117.0
## 1st Qu.:2016-07-26 1st Qu.:145.2
## Median :2017-10-24 Median :195.9
## Mean :2017-10-25 Mean :206.5
## 3rd Qu.:2019-01-26 3rd Qu.:262.8
## Max. :2020-04-27 Max. :317.6
str(stocks_ANTM)
## An 'xts' object on 2015-04-28/2020-04-27 containing:
## Data: num [1:1259, 1] 150 156 152 152 153 ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Full-sample price plot for BA
plot(stocks_BA)
# Autocorrelation of ANTM with the default number of lags
acf(stocks_ANTM)
# Autocorrelations up to lag 30, plotted and stored
acf_ANTM <- acf(stocks_ANTM, lag.max = 30)
acf_BA <- acf(stocks_BA, lag.max = 30)
acf_ANET <- acf(stocks_ANET, lag.max = 30)
# Same BA autocorrelation again, stored without drawing a plot
acf_BA <- acf(stocks_BA, lag.max = 30, plot = FALSE)
autoplot(stocks_ANTM)
frequency(stocks_ANTM)
## [1] 1
Plot for 2020 Data only
acf(stocks_ANTM_MY,lag.max = 30)
autoplot(stocks_ANTM_MY)
head(stocks_ANTM_MY)
## [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
## 2020-01-06 295.75
## 2020-01-07 299.20
## 2020-01-08 300.89
## 2020-01-09 307.83
frequency(stocks_ANTM_MY)
## [1] 1
For the rest of the data, before 2020
Plot for pre-2020 data only
acf(stocks_ANTM_old)
plot(stocks_ANTM_old)
head(stocks_ANTM_old)
## [,1]
## 2015-04-28 150.15
## 2015-04-29 155.96
## 2015-04-30 151.83
## 2015-05-01 151.92
## 2015-05-04 153.45
## 2015-05-05 154.67
frequency(stocks_ANTM_old)
## [1] 1
The ACF plots test whether an individual lag autocorrelation is different from zero. An alternative approach is to use the Ljung-Box test, which tests whether any of a group of autocorrelations of a time series are different from zero.
In essence, it tests the “overall randomness” based on a number of lags. If the result is a small p-value, then it indicates the data are probably not white noise.
For 2020 Data
# Ljung-Box test on the ANTM column of wide_data_Main_20, jointly testing the
# first 30 autocorrelations. fitdf = 0 because the test is applied to the raw
# series rather than to model residuals. The test type is spelled out in full
# instead of relying on partial matching of "Lj".
Box.test(wide_data_Main_20$ANTM, lag = 30, fitdf = 0, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: wide_data_Main_20$ANTM
## X-squared = 480.01, df = 30, p-value < 2.2e-16
# Same test on the ANTM column of wide_data_Main, using the first 4
# autocorrelations.
Box.test(wide_data_Main$ANTM, lag = 4, fitdf = 0, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: wide_data_Main$ANTM
## X-squared = 4976, df = 4, p-value < 2.2e-16
Here, we perform Ljung-Box tests on the first 30 lag autocorrelations of the 2020 data and on the first 4 lag autocorrelations of the wide_data_Main series. The resulting p-values are significant at p < .001, so this supports our ACF plot consideration above, where we stated that it is likely this is not purely white noise and that some time series information exists in this data.
# Plot ANTM, ANET and BA together after column-binding them:
# first the full series, then the *_MY subsets (2020 for ANTM; presumably the
# same window for the others), then the *_old subsets (presumably pre-2020).
plot(cbind(stocks_ANTM,stocks_ANET,stocks_BA))
plot(cbind(stocks_ANTM_MY,stocks_ANET_MY,stocks_BA_MY))
plot(cbind(stocks_ANTM_old,stocks_ANET_old,stocks_BA_old))
These plots suggest that the stocks improved from their position from mid-2016 through 2018, and then remained roughly constant until late 2019 and early 2020.
The trend is the long-term increase or decrease in the data. There is an increasing trend in the stock price data. The seasonal pattern occurs when a time series is affected by seasonal factors such as the time of the year or the day of the week. The daily data of stocks_ANTM doesn’t show any seasonality in the graph.
The cycle occurs when the data exhibit rises and falls that are not of a fixed period. These fluctuations are usually due to economic conditions and are often related to the “business cycle”. We can see a few cycles in our stocks_ANTM data from 2015 to 2018, and then in 2020 we have a sudden drop due to COVID-19. (Source: https://afit-r.github.io/ts_exploration)
Another way to look at time series data is to plot each observation against another observation that occurred some time previously. For example, we could plot $y_t$ against $y_{t-1}$. This is called a lag plot because you are plotting the time series against lags of itself.
# Plot the differenced series (diff() with its default settings) of each stock.
plot(diff(stocks_ANTM))
plot(diff(stocks_BA))
plot(diff(stocks_ANET))
# Keep the differenced series so they can be plotted together.
dif_stocks_ANTM <- diff(stocks_ANTM)
dif_stocks_ANET <- diff(stocks_ANET)
dif_stocks_BA <- diff(stocks_BA)
plot(cbind(dif_stocks_ANTM,dif_stocks_ANET,dif_stocks_BA))
# Correlation between the stocks
# BA vs ANET on the xts series.
cor(stocks_BA,stocks_ANET)
## [,1]
## [1,] 0.9344986
# ANET vs ANTM on three tables: wide_data_Main, wide_data_Main_20 and
# wide_data_Main_Old (presumably full sample, 2020 only and pre-2020).
cor(wide_data_Main$ANET,wide_data_Main$ANTM)
## [1] 0.8950863
cor(wide_data_Main_20$ANET,wide_data_Main_20$ANTM)
## [1] 0.6139195
cor(wide_data_Main_Old$ANET,wide_data_Main_Old$ANTM)
## [1] 0.9075179
# BA vs ANTM; the xts series and wide_data_Main give the same value.
cor(stocks_BA,stocks_ANTM)
## [,1]
## [1,] 0.9196915
cor(wide_data_Main$BA,wide_data_Main$ANTM)
## [1] 0.9196915
cor(wide_data_Main_20$BA,wide_data_Main_20$ANTM)
## [1] 0.8360629
cor(wide_data_Main_Old$BA,wide_data_Main_Old$ANTM)
## [1] 0.9595244
# Pairwise panels for the three stocks, one call per table.
psych::pairs.panels(as.matrix(wide_data_Main[,c('ANTM','BA','ANET')]))
psych::pairs.panels(as.matrix(wide_data_Main_Old[,c('ANTM','BA','ANET')]))
psych::pairs.panels(as.matrix(wide_data_Main_20[,c('ANTM','BA','ANET')]))
# stocks_ANTM carries both the xts and zoo classes.
class(stocks_ANTM)
## [1] "xts" "zoo"
plot.xts(stocks_ANTM)
# Correlograms: ANTM up to lag 30, ANET and BA with the default number of lags.
acf(stocks_ANTM,lag.max = 30)
acf(stocks_ANET)
acf(stocks_BA)
# Differenced ANTM series after converting the xts object to zoo.
plot(diff(as.zoo(stocks_ANTM)))
head(as.zoo(stocks_ANTM))
##
## 2015-04-28 150.15
## 2015-04-29 155.96
## 2015-04-30 151.83
## 2015-05-01 151.92
## 2015-05-04 153.45
## 2015-05-05 154.67
White noise: time series that show no autocorrelation are called “white noise”. The plots above suggest that the series behaves like a random walk, and that the Moving Average (MA) model should give better estimates of this index.
# For a given time series x we can fit the autoregressive (AR) model using the
# arima() command and setting order equal to c(1, 0, 0). Note for reference
# that an AR model is an ARIMA(1, 0, 0) model.
# Fit with Full Data
# plot.ts(stocks_ANTM)

# AR(1) model on the full series; fitted values = observed - residuals.
AR_ANTM <- arima(stocks_ANTM, order = c(1, 0, 0))
AR_ANTM_fit <- as.ts(stocks_ANTM) - residuals(AR_ANTM)

# MA(1) model on the full series; fitted values = observed - residuals.
MA_ANTM <- arima(stocks_ANTM, order = c(0, 0, 1))
MA_ANTM_fit <- as.ts(stocks_ANTM) - residuals(MA_ANTM)
# Summary of the AR fit on the full series: coefficients, AIC and
# training-set error measures.
summary(AR_ANTM)
##
## Call:
## arima(x = stocks_ANTM, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.9978 222.5894
## s.e. 0.0018 45.1308
##
## sigma^2 estimated as 16.88: log likelihood = -3568.12, aic = 7142.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0542719 4.108244 2.529343 -0.01210249 1.19899 1.002039
## ACF1
## Training set -0.0209907
# Summary of the MA fit on the full series. Its AIC (12245.19) and RMSE (31.2)
# are much higher than those printed for the AR fit (7142.25 and 4.11).
summary(MA_ANTM)
##
## Call:
## arima(x = stocks_ANTM, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.9678 206.4969
## s.e. 0.0058 1.7300
##
## sigma^2 estimated as 973.9: log likelihood = -6119.59, aic = 12245.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01566924 31.20706 27.96412 -4.741534 15.01558 11.07842 0.9108069
# points(AR_ANTM_fit, type = "l", col = 4, lty = 2)
# points(MA_ANTM_fit, type = "l", col = 3, lty = 3)
# Overlay the observed ANTM prices with the AR and MA fitted values; the
# colour labels identify the three lines in the legend.
ggplot(stocks_ANTM, aes(x = index(stocks_ANTM))) +
  geom_line(aes(y = coredata(stocks_ANTM), colour = "BASE")) +
  geom_line(aes(y = AR_ANTM_fit, colour = "AR Fit")) +
  geom_line(aes(y = MA_ANTM_fit, colour = "MA Fit")) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  labs(title = "Anthem(ANTM) from 2015-20", x = "Date", y = "Price")
# scale_colour_manual("Series", values=c("AR Fit"="gray40", "MA Fit"="firebrick4", "BASE"="darkcyan"))
Fit with only the 2020 data
# plot.ts(stocks_ANTM_MY)

# AR(1) model on the 2020 series; fitted values = observed - residuals.
AR_ANTM_MY <- arima(stocks_ANTM_MY, order = c(1, 0, 0))
AR_ANTM_MY_fit <- as.ts(stocks_ANTM_MY) - residuals(AR_ANTM_MY)

# MA(1) model on the 2020 series; fitted values = observed - residuals.
MA_ANTM_MY <- arima(stocks_ANTM_MY, order = c(0, 0, 1))
MA_ANTM_MY_fit <- as.ts(stocks_ANTM_MY) - residuals(MA_ANTM_MY)
# points(AR_ANTM_MY_fit, type = "l", col = 4, lty = 2)
# points(MA_ANTM_MY_fit, type = "l", col = 3, lty = 3)

# Observed 2020 prices together with both sets of fitted values.
ggplot(stocks_ANTM_MY, aes(x = index(stocks_ANTM_MY))) +
  geom_line(aes(y = coredata(stocks_ANTM_MY), colour = "BASE")) +
  geom_line(aes(y = AR_ANTM_MY_fit, colour = "AR Fit")) +
  geom_line(aes(y = MA_ANTM_MY_fit, colour = "MA Fit")) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  labs(title = "Anthem(ANTM) from 2020", x = "Date", y = "Price")
# Fit with Data before 2020

# AR(1) model on the stocks_ANTM_old series; fitted values = observed - residuals.
AR_ANTM_old <- arima(stocks_ANTM_old, order = c(1, 0, 0))
AR_ANTM_old_fit <- as.ts(stocks_ANTM_old) - residuals(AR_ANTM_old)

# MA(1) model on the same series; fitted values = observed - residuals.
MA_ANTM_old <- arima(stocks_ANTM_old, order = c(0, 0, 1))
MA_ANTM_old_fit <- as.ts(stocks_ANTM_old) - residuals(MA_ANTM_old)

# Observed prices together with both sets of fitted values.
ggplot(stocks_ANTM_old, aes(x = index(stocks_ANTM_old))) +
  geom_line(aes(y = coredata(stocks_ANTM_old), colour = "BASE")) +
  geom_line(aes(y = AR_ANTM_old_fit, colour = "AR Fit")) +
  geom_line(aes(y = MA_ANTM_old_fit, colour = "MA Fit")) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  labs(title = "Anthem(ANTM) Before 2020", x = "Date", y = "Price")
We will evaluate all the data models and see their predictions using both models with the current year's data.
# Make a 1-step through 10-step forecast based on the AR model (AR_ANTM).
# predict() returns the point forecasts in $pred and their standard errors in $se.
predict(AR_ANTM,n.ahead = 10)
## $pred
## Time Series:
## Start = 1260
## End = 1269
## Frequency = 1
## [1] 267.4619 267.3640 267.2663 267.1688 267.0716 266.9745 266.8777 266.7811
## [9] 266.6846 266.5884
##
## $se
## Time Series:
## Start = 1260
## End = 1269
## Frequency = 1
## [1] 4.108244 5.803600 7.100186 8.189663 9.146360 10.008444 10.798612
## [8] 11.531670 12.217916 12.864855
# Plot the series plus the forecast and approximate 95% prediction intervals
# (forecast +/- 2 standard errors).
# predict() returns the point forecasts ($pred) and their standard errors
# ($se) together, so the model is queried only once.
AR_prediction <- predict(AR_ANTM, n.ahead = 300)
AR_forecasts <- AR_prediction$pred
AR_forecast_se <- AR_prediction$se
AR_forecast_lower <- AR_forecasts - 2 * AR_forecast_se
AR_forecast_upper <- AR_forecasts + 2 * AR_forecast_se
# The forecasts continue past the last observation, so both axes are widened;
# with the default limits most of the forecast would fall outside the
# plotting region and not be drawn.
plot.ts(
  stocks_ANTM,
  xlim = range(time(as.ts(stocks_ANTM)), time(AR_forecasts)),
  ylim = range(
    as.numeric(stocks_ANTM), AR_forecast_lower, AR_forecast_upper,
    na.rm = TRUE
  )
)
lines(AR_forecasts, col = 4, lty = 2)
# points(AR_forecasts - AR_forecast_se, type = "l", col = 2, lty = 1)
lines(AR_forecast_lower, col = 2, lty = 1)
lines(AR_forecast_upper, col = 2, lty = 1)
#--------------------
library(forecast)
# We can then use the fitted AR model (AR_ANTM) to make forecasts for future values of the time series, using forecast().
# h = 30 requests 30 steps ahead; level = c(99.5) requests a 99.5% prediction interval.
AR_ANTM_forcast <- forecast(AR_ANTM, h=30,level=c(99.5))
# Plot the observed stock values together with the values predicted for the next 30 steps by the AR_ANTM model.
plot(AR_ANTM_forcast)
#--------------------
# Plot of the original data set and the prediction for 2020 based on the old
# data. The models were fitted on stocks_ANTM_old, so their forecasts start
# right after that series ends (presumably overlapping the 2020 part of
# stocks_ANTM, which allows a visual comparison).
# Each predict() call is made once; $pred holds the point forecasts and $se
# their standard errors.
AR_old_prediction <- predict(AR_ANTM_old, n.ahead = 300)
AR_old_forecasts <- AR_old_prediction$pred
AR_old_forecast_se <- AR_old_prediction$se
AR_old_lower <- AR_old_forecasts - 2 * AR_old_forecast_se
AR_old_upper <- AR_old_forecasts + 2 * AR_old_forecast_se
# Widen both axes so that the whole forecast band is inside the plot region.
plot.ts(
  stocks_ANTM,
  xlim = range(time(as.ts(stocks_ANTM)), time(AR_old_forecasts)),
  ylim = range(
    as.numeric(stocks_ANTM), AR_old_lower, AR_old_upper,
    na.rm = TRUE
  )
)
lines(AR_old_forecasts, col = 4, lty = 2)
lines(AR_old_lower, col = 2, lty = 1)
lines(AR_old_upper, col = 2, lty = 1)
# MA (moving average) model
MA_old_prediction <- predict(MA_ANTM_old, n.ahead = 300)
MA_old_forecasts <- MA_old_prediction$pred
MA_old_forecast_se <- MA_old_prediction$se
MA_old_lower <- MA_old_forecasts - 2 * MA_old_forecast_se
MA_old_upper <- MA_old_forecasts + 2 * MA_old_forecast_se
plot.ts(
  stocks_ANTM,
  xlim = range(time(as.ts(stocks_ANTM)), time(MA_old_forecasts)),
  ylim = range(
    as.numeric(stocks_ANTM), MA_old_lower, MA_old_upper,
    na.rm = TRUE
  )
)
lines(MA_old_forecasts, col = 4, lty = 2)
lines(MA_old_lower, col = 2, lty = 1)
lines(MA_old_upper, col = 2, lty = 1)
# @----------------------
# 5-step forecast from the AR model fitted on the 2020 series.
# The point forecasts and the standard errors are taken from the SAME
# predict() call so that both cover the same 5-step horizon; mixing horizons
# truncates the +/- 2 SE band to the shorter one.
AR_MY_prediction <- predict(AR_ANTM_MY, n.ahead = 5)
AR_MY_forecasts <- AR_MY_prediction$pred
AR_MY_forecast_se <- AR_MY_prediction$se
AR_MY_lower <- AR_MY_forecasts - 2 * AR_MY_forecast_se
AR_MY_upper <- AR_MY_forecasts + 2 * AR_MY_forecast_se
# The forecasts lie past the last observation, so widen both axes to keep
# them inside the plotting region.
plot.ts(
  stocks_ANTM_MY,
  xlim = range(time(as.ts(stocks_ANTM_MY)), time(AR_MY_forecasts)),
  ylim = range(
    as.numeric(stocks_ANTM_MY), AR_MY_lower, AR_MY_upper,
    na.rm = TRUE
  )
)
lines(AR_MY_forecasts, col = 4, lty = 2)
lines(AR_MY_lower, col = 2, lty = 1)
lines(AR_MY_upper, col = 2, lty = 1)
# Sanity checks: size of the 2020 series and number of forecast steps.
dim(coredata(stocks_ANTM_MY))
## [1] 80 1
length(AR_MY_forecasts)
## [1] 5
# TODO: reduce the date range to see how it goes for the month of March
# TODO: plot the correlation plot with the other shares on top
# ggplot(stocks_ANTM_MY, aes(x = index(stocks_ANTM_MY)))
# autoplot(AR_ANTM_forcast) + geom_smooth()
# geom_line(aes(y= coredata(stocks_ANTM_MY) , color="BASE")) +
# ggtitle("Anthem(ANTM): Predicted next 30") +
# scale_x_date(date_labels = "%b %y", date_breaks = "3 months")+
# xlab("Date") + ylab("Price")
#
# geom_line(aes(y = AR_forecasts, color = "EXACT")) +
# geom_line(aes(y = AR_forecasts - 2*AR_forecast_se, color = "-Range"))+
# geom_line(aes(y = AR_forecasts + 2*AR_forecast_se, color = "+Range"))+
# ggtitle("Anthem(ANTM): Predicted next 30") +
# scale_x_date(date_labels = "%b %y", date_breaks = "3 months")+
# xlab("Date") + ylab("Price")
# Compare the AR and MA fits by AIC and BIC (lower values indicate the
# preferred model). In all three samples below the AR model has the lower
# AIC and BIC.
# Full series:
AIC(AR_ANTM,MA_ANTM)
## df AIC
## AR_ANTM 3 7142.246
## MA_ANTM 3 12245.188
BIC(AR_ANTM,MA_ANTM)
## df BIC
## AR_ANTM 3 7157.66
## MA_ANTM 3 12260.60
# 2020 series:
AIC(AR_ANTM_MY,MA_ANTM_MY)
## df AIC
## AR_ANTM_MY 3 608.5171
## MA_ANTM_MY 3 707.9615
BIC(AR_ANTM_MY,MA_ANTM_MY)
## df BIC
## AR_ANTM_MY 3 615.6632
## MA_ANTM_MY 3 715.1076
# stocks_ANTM_old series:
AIC(AR_ANTM_old,MA_ANTM_old)
## df AIC
## AR_ANTM_old 3 6151.07
## MA_ANTM_old 3 11435.55
BIC(AR_ANTM_old,MA_ANTM_old)
## df BIC
## AR_ANTM_old 3 6166.287
## MA_ANTM_old 3 11450.772